# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
## Loading required package: librarian
librarian::shelf(
dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)
# set random seed for reproducibility
set.seed(42)
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = T)
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- FALSE
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Haliaeetus leucocephalus',
from = 'gbif', has_coords = T, limit = 10000))
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldPhysical")
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
# crop the environmental rasters to a reasonable study area around our species observations
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
# mapview(obs) +
# mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
## Warning: Removed 147 rows containing non-finite values (stat_density).
## Pairs plot to show correlations between variables
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 60 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 15 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 66 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 28 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 66 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 28 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 66 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 28 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values
## Warning: Removed 60 rows containing missing values (geom_point).
## Warning: Removed 66 rows containing missing values (geom_point).
## Warning: Removed 66 rows containing missing values (geom_point).
## Warning: Removed 66 rows containing missing values (geom_point).
## Warning: Removed 60 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 60 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 60 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 60 rows containing missing values
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning: Removed 60 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 15 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 15 rows containing missing values
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 60 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 60 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
# Setup Data - Drop rows with any NAs - remove terms we don’t want to model - use a simplified formula
_present_ ~. to predict where the species is present based on all other fields in the data from (ie. y ~ X1 + X2 + …Xn)
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19934
# fit a linear model
mdl_linear <- lm(present ~ ., data = d)
summary(mdl_linear)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28636 -0.33804 0.04561 0.35671 1.32955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.762e-01 1.111e-01 3.388 0.000707 ***
## WC_alt 1.445e-04 1.195e-05 12.088 < 2e-16 ***
## WC_bio1 6.831e-02 2.230e-03 30.636 < 2e-16 ***
## WC_bio2 -6.595e-02 2.071e-03 -31.844 < 2e-16 ***
## ER_tri -2.455e-03 1.724e-04 -14.238 < 2e-16 ***
## ER_topoWet -5.597e-02 3.489e-03 -16.045 < 2e-16 ***
## lon 6.501e-03 2.897e-04 22.442 < 2e-16 ***
## lat 3.724e-02 1.976e-03 18.840 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4322 on 19926 degrees of freedom
## Multiple R-squared: 0.2532, Adjusted R-squared: 0.253
## F-statistic: 965.3 on 7 and 19926 DF, p-value: < 2.2e-16
y_predict <- predict(mdl_linear, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.329554 1.286362
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl_glm <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl_glm)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7969 -0.8593 -0.1855 0.9010 2.9607
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.763e+00 5.985e-01 -2.945 0.00323 **
## WC_alt 7.552e-04 6.628e-05 11.394 < 2e-16 ***
## WC_bio1 3.629e-01 1.250e-02 29.038 < 2e-16 ***
## WC_bio2 -3.241e-01 1.199e-02 -27.020 < 2e-16 ***
## ER_tri -1.305e-02 9.681e-04 -13.477 < 2e-16 ***
## ER_topoWet -2.757e-01 1.888e-02 -14.602 < 2e-16 ***
## lon 3.354e-02 1.583e-03 21.183 < 2e-16 ***
## lat 2.077e-01 1.088e-02 19.087 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27634 on 19933 degrees of freedom
## Residual deviance: 21989 on 19926 degrees of freedom
## AIC: 22005
##
## Number of Fisher Scoring iterations: 4
y_predict_glm <- predict(mdl_glm, d, type = "response")
range(y_predict_glm)
## [1] 0.01209068 0.97998789
Look at the terms plots to see the relationship between predictor and response
# show term plots
termplot(mdl_glm, partial.resid = TRUE, se = TRUE, main = F, ylim = "free")
## Generalize Additive Model With a general additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl_gen_add <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl_gen_add)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1347 0.0357 -3.772 0.000162 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.762 8.979 490.73 <2e-16 ***
## s(WC_bio1) 7.950 8.455 349.27 <2e-16 ***
## s(WC_bio2) 8.767 8.980 462.99 <2e-16 ***
## s(ER_tri) 8.772 8.984 112.77 <2e-16 ***
## s(ER_topoWet) 8.454 8.894 73.74 <2e-16 ***
## s(lon) 7.486 8.449 163.18 <2e-16 ***
## s(lat) 8.872 8.992 229.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.441 Deviance explained = 38.8%
## UBRE = -0.14616 Scale est. = 1 n = 19934
# show term plot
plot(mdl_gen_add, scale=0)